from __future__ import annotations

import argparse
from pathlib import Path

import numpy as np
import pandas as pd


def build_panel(
    owid_monthly_path: Path,
    vdem_baseline_path: Path,
    worldbank_baseline_path: Path | None,
    out_path: Path,
    min_months: int,
) -> pd.DataFrame:
    """Merge the monthly OWID table with country baselines and write the panel.

    The OWID table (parquet) is left-joined to the V-Dem baseline (CSV) on
    ``iso_code`` == ``iso3``. When a World Bank baseline path is given and the
    file exists, it is left-joined the same way; columns that clash with ones
    already present receive a ``_wb`` suffix.

    Derived columns added to the result:

    * ``month`` parsed to datetime, ``year`` (int) and ``month_id`` (the
      monthly period rendered as a string);
    * ``log_gdp_pc`` and ``log_population``, the natural logs of
      ``gdp_per_capita`` and ``population``. Non-positive inputs yield NaN
      rather than ``-inf``/a runtime warning, so no infinite values are
      written to the panel.

    Args:
        owid_monthly_path: Parquet file with the monthly OWID data.
        vdem_baseline_path: CSV file with the V-Dem baseline, keyed by ``iso3``.
        worldbank_baseline_path: Optional CSV keyed by ``iso3``; skipped when
            ``None`` or when the file does not exist.
        out_path: Destination parquet file; parent directories are created.
        min_months: Minimum number of distinct months a country must have to
            be kept.

    Returns:
        The filtered, merged panel (the same frame that is written to
        ``out_path``).

    Raises:
        KeyError: If a column this function reads (``iso_code``, ``iso3``,
            ``month``, ``gdp_per_capita``, ``population``) is missing.
    """
    owid = pd.read_parquet(owid_monthly_path)
    vdem = pd.read_csv(vdem_baseline_path)

    # NOTE(review): both joins presumably expect one baseline row per iso3;
    # duplicate keys in a baseline file would duplicate panel rows — confirm.
    merged = owid.merge(vdem, how="left", left_on="iso_code", right_on="iso3")
    if worldbank_baseline_path is not None and worldbank_baseline_path.exists():
        wb = pd.read_csv(worldbank_baseline_path)
        merged = merged.merge(wb, how="left", left_on="iso_code", right_on="iso3", suffixes=("", "_wb"))

    merged["month"] = pd.to_datetime(merged["month"])
    merged["year"] = merged["month"].dt.year.astype(int)
    merged["month_id"] = merged["month"].dt.to_period("M").astype(str)

    # log() is undefined for values <= 0: mask them to NaN first so the panel
    # never contains -inf and numpy does not emit divide/invalid warnings.
    for source_col, log_col in (
        ("gdp_per_capita", "log_gdp_pc"),
        ("population", "log_population"),
    ):
        values = merged[source_col]
        merged[log_col] = np.log(values.where(values > 0))

    # Keep only countries observed in at least `min_months` distinct months.
    counts = merged.groupby("iso_code")["month"].nunique().rename("n_months")
    keep = counts[counts >= min_months].index
    merged = merged[merged["iso_code"].isin(keep)].copy()

    out_path.parent.mkdir(parents=True, exist_ok=True)
    merged.to_parquet(out_path, index=False)
    return merged


def main() -> None:
    """Parse command-line options, build the merged panel, and report its size.

    The World Bank baseline is passed on to ``build_panel`` only when the
    given path exists on disk; otherwise ``None`` is passed in its place.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument(
        "--owid-monthly",
        type=Path,
        default=Path("outputs/data/owid_monthly.parquet"),
    )
    cli.add_argument(
        "--vdem-baseline",
        type=Path,
        default=Path("outputs/data/vdem_baseline.csv"),
    )
    cli.add_argument(
        "--worldbank-baseline",
        type=Path,
        default=Path("outputs/data/worldbank_baseline_2019.csv"),
    )
    cli.add_argument(
        "--out",
        type=Path,
        default=Path("outputs/data/panel_merged.parquet"),
    )
    cli.add_argument("--min-months", type=int, default=18)
    opts = cli.parse_args()

    # Fall back to None when the World Bank file is absent.
    wb_baseline = opts.worldbank_baseline
    if not wb_baseline or not wb_baseline.exists():
        wb_baseline = None

    panel = build_panel(
        opts.owid_monthly,
        opts.vdem_baseline,
        wb_baseline,
        opts.out,
        opts.min_months,
    )
    n_rows, n_cols = panel.shape
    print(f"Wrote {opts.out} rows={n_rows:,} cols={n_cols:,}")


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
